Sampling

Prof. Dr. Jörg Schoder

2023-05-29

Urnenmodell

Quelle: moderndive.com

Zufallsexperiment und Stichprobenziehung

Stichprobenziehung

Ergebnis einer Stichprobe

Ergebnis mehrerer Stichproben

Datensatz zum physischen Experiment

library(moderndive)
tactile_prop_red
## # A tibble: 33 × 4
##    group            replicate red_balls prop_red
##    <chr>                <int>     <int>    <dbl>
##  1 Ilyas, Yohan             1        21     0.42
##  2 Morgan, Terrance         2        17     0.34
##  3 Martin, Thomas           3        21     0.42
##  4 Clark, Frank             4        21     0.42
##  5 Riddhi, Karina           5        18     0.36
##  6 Andrew, Tyler            6        19     0.38
##  7 Julia                    7        19     0.38
##  8 Rachel, Lauren           8        11     0.22
##  9 Daniel, Caroline         9        15     0.3 
## 10 Josh, Maeve             10        17     0.34
## # ℹ 23 more rows

Stichprobenverteilung

library(tidyverse)
ggplot(tactile_prop_red, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05, boundary = 0.4,
                 color = "white") +
  scale_y_continuous(limits = c(0, 10), breaks = c(0:10)) +
  labs(x = "Anteil roter Kugeln aus insgesamt 50 (roten und weißen) Kugeln",
       y="Anzahl",
       title = "Verteilung von 33 Anteilswerten roter Kugeln")

Wahre Verteilung

Daten

bowl
## # A tibble: 2,400 × 2
##    ball_ID color
##      <int> <chr>
##  1       1 white
##  2       2 white
##  3       3 white
##  4       4 red  
##  5       5 white
##  6       6 white
##  7       7 red  
##  8       8 white
##  9       9 red  
## 10      10 white
## # ℹ 2,390 more rows

Anzahl und Anteil roter Kugeln

red_true <- bowl %>%
               summarize(Anzahl_rot = sum(color == "red"),
                     Anteil_rot = sum(color == "red")/length(color)
            )
red_true  %>%
    mutate(Anteil_rot=paste0(Anteil_rot*100,"%"))
## # A tibble: 1 × 2
##   Anzahl_rot Anteil_rot
##        <int> <chr>     
## 1        900 37.5%

(Virtuelles) Sampling und Punktschätzung

Unterschiedliche Schaufelgrößen

“Kleine Schaufel” (Stichprobengröße n = 25)

n <- 25
rep <- 1000
virtual_samples_25 <- bowl %>%
                        rep_sample_n(size = n,
                                     reps = rep)
virtual_samples_25
## # A tibble: 25,000 × 3
## # Groups:   replicate [1,000]
##    replicate ball_ID color
##        <int>   <int> <chr>
##  1         1    2227 red  
##  2         1    1774 red  
##  3         1     449 white
##  4         1    1607 white
##  5         1     305 white
##  6         1    1170 white
##  7         1     491 red  
##  8         1    1576 white
##  9         1    1663 white
## 10         1    1950 white
## # ℹ 24,990 more rows
virtual_prop_red_25 <- virtual_samples_25 %>%
                              group_by(replicate) %>%
                              summarize(red = sum(color == "red")) %>%
                              mutate(prop_red = red / n)
virtual_prop_red_25
## # A tibble: 1,000 × 3
##    replicate   red prop_red
##        <int> <int>    <dbl>
##  1         1     7     0.28
##  2         2    13     0.52
##  3         3    11     0.44
##  4         4     9     0.36
##  5         5     5     0.2 
##  6         6    10     0.4 
##  7         7    10     0.4 
##  8         8    11     0.44
##  9         9    10     0.4 
## 10        10     8     0.32
## # ℹ 990 more rows
virtual_prop_red_25 %>% 
  ggplot(aes(x = prop_red)) +
      geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
      labs(x = paste0("Anteil roter Kugeln (aus ",n,")"),
           title = paste0("Kleine Schaufel (n=",n,")")) +
      geom_vline(xintercept = red_true$Anteil_rot,color='red')

“Mittlere Schaufel” (Stichprobengröße n = 50)

n <- 50
virtual_samples_50 <- bowl %>%
                         rep_sample_n(size = n,
                                      reps = rep)
virtual_prop_red_50 <- virtual_samples_50 %>%
                              group_by(replicate) %>%
                              summarize(red = sum(color == "red")) %>%
                              mutate(prop_red = red / n)
virtual_prop_red_50
## # A tibble: 1,000 × 3
##    replicate   red prop_red
##        <int> <int>    <dbl>
##  1         1    19     0.38
##  2         2    17     0.34
##  3         3    21     0.42
##  4         4    22     0.44
##  5         5    15     0.3 
##  6         6    21     0.42
##  7         7    16     0.32
##  8         8    23     0.46
##  9         9    16     0.32
## 10        10    24     0.48
## # ℹ 990 more rows
virtual_prop_red_50 %>% 
  ggplot(aes(x = prop_red)) +
      geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
      labs(x = paste0("Anteil roter Kugeln (aus ",n,")"),
           title = paste0("Kleine Schaufel (n=",n,")")) +
      geom_vline(xintercept = red_true$Anteil_rot,color='red')

“Große Schaufel” (Stichprobengröße n = 100)

n<-100
virtual_samples_100 <- bowl %>%
                          rep_sample_n(size = n,
                                       reps = rep)
virtual_samples_100
## # A tibble: 100,000 × 3
## # Groups:   replicate [1,000]
##    replicate ball_ID color
##        <int>   <int> <chr>
##  1         1    2211 white
##  2         1     946 white
##  3         1     605 white
##  4         1    1818 red  
##  5         1     379 white
##  6         1    1407 white
##  7         1    1368 white
##  8         1    1440 white
##  9         1     232 red  
## 10         1    1611 red  
## # ℹ 99,990 more rows
virtual_prop_red_100 <- virtual_samples_100 %>%
                              group_by(replicate) %>%
                              summarize(red = sum(color == "red")) %>%
                              mutate(prop_red = red / n)
virtual_prop_red_100
## # A tibble: 1,000 × 3
##    replicate   red prop_red
##        <int> <int>    <dbl>
##  1         1    39     0.39
##  2         2    41     0.41
##  3         3    28     0.28
##  4         4    45     0.45
##  5         5    35     0.35
##  6         6    30     0.3 
##  7         7    41     0.41
##  8         8    44     0.44
##  9         9    38     0.38
## 10        10    33     0.33
## # ℹ 990 more rows
virtual_prop_red_100 %>% 
  ggplot(aes(x = prop_red)) +
      geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
      labs(x = paste0("Anteil roter Kugeln (aus ",n,")"),
           title = paste0("Kleine Schaufel (n=",n,")")) +
      geom_vline(xintercept = red_true$Anteil_rot,color='red')

Grundproblem der induktiven Statistik

\(\Rightarrow\) Bedeutung der Zufallsaufwahl bei der Datenerhebung (!)

Wenn der wahre Wert unbekannt ist

(Virtuelles) Ziehen einer Stichprobe

n <- 50
virtual_shovel <- bowl %>% 
                     rep_sample_n(size = n)
virtual_shovel
## # A tibble: 50 × 3
## # Groups:   replicate [1]
##    replicate ball_ID color
##        <int>   <int> <chr>
##  1         1     474 red  
##  2         1     133 white
##  3         1    1063 red  
##  4         1     138 red  
##  5         1     849 white
##  6         1     228 white
##  7         1    1123 white
##  8         1    1347 red  
##  9         1    1545 red  
## 10         1    1457 white
## # ℹ 40 more rows

Anzahl und Anteil der roten Kugeln

virtual_shovel %>% 
  summarize(num_red = sum(color == "red")) %>% 
  mutate(prop_red = num_red /n)
## # A tibble: 1 × 3
##   replicate num_red prop_red
##       <int>   <int>    <dbl>
## 1         1      25      0.5

Statistische Inferenz

Stichprobenverteilung

Reliabilität und Validität

Stichprobenverteilungen…

…und wahrer Wert (rote Linien)

Vergleich der Standardfehler

virtual_prop_red_25 %>%
  summarize(sd = sd(prop_red))
## # A tibble: 1 × 1
##       sd
##    <dbl>
## 1 0.0970
virtual_prop_red_50 %>%
  summarize(sd = sd(prop_red))
## # A tibble: 1 × 1
##       sd
##    <dbl>
## 1 0.0662
virtual_prop_red_100 %>%
  summarize(sd = sd(prop_red))
## # A tibble: 1 × 1
##       sd
##    <dbl>
## 1 0.0470

Intervallschätzung und Konfidenzintervall

Punktschätzung vs. Intervallschätzung

Ermittlung von Konfidenzintervallen

Einzelne Stichprobe aus dem physischen Experiment

bowl_sample_1
## # A tibble: 50 × 1
##    color
##    <chr>
##  1 white
##  2 white
##  3 red  
##  4 red  
##  5 white
##  6 white
##  7 red  
##  8 white
##  9 white
## 10 white
## # ℹ 40 more rows
stats_sample_1 <- bowl_sample_1 %>%
                  summarize(Anzahl_rot=sum(color=='red'),
                            Anteil_rot=sum(color=='red')/
                                                length(color))
stats_sample_1
## # A tibble: 1 × 2
##   Anzahl_rot Anteil_rot
##        <int>      <dbl>
## 1         21       0.42

In der Stichprobe von Ilyas und Yohan befinden sich insgesamt 21 rote Kugeln, d.h. der Anteil roter Kugeln entspricht in ihrer Stichprobe 42%.

Nutzung der Funktionen im infer-Paket

Schritt 1: specify()

library(infer)
#bowl_sample_1 %>%        
#    specify(response = color)   # funktioniert nicht - "success" (also das "Ereignis A") muss definiert werden!

bowl_sample_1 %>%
    specify(response = color, success = "red")
## Response: color (factor)
## # A tibble: 50 × 1
##    color
##    <fct>
##  1 white
##  2 white
##  3 red  
##  4 red  
##  5 white
##  6 white
##  7 red  
##  8 white
##  9 white
## 10 white
## # ℹ 40 more rows

Schritt 2: generate()

bowl_sample_1 %>%
  specify(response = color, success = "red") %>%
  generate(reps = 1000, type = "bootstrap")
## Response: color (factor)
## # A tibble: 50,000 × 2
## # Groups:   replicate [1,000]
##    replicate color
##        <int> <fct>
##  1         1 white
##  2         1 red  
##  3         1 red  
##  4         1 red  
##  5         1 white
##  6         1 red  
##  7         1 white
##  8         1 white
##  9         1 red  
## 10         1 white
## # ℹ 49,990 more rows

Schritt 3: calculate()

sample_1_bootstrap <- bowl_sample_1 %>%
                           specify(response = color,
                                   success = "red") %>%
                           generate(reps = 1000,
                                    type = "bootstrap") %>%
                           calculate(stat = "prop")
sample_1_bootstrap
## Response: color (factor)
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1  0.44
##  2         2  0.4 
##  3         3  0.44
##  4         4  0.4 
##  5         5  0.48
##  6         6  0.34
##  7         7  0.4 
##  8         8  0.48
##  9         9  0.44
## 10        10  0.3 
## # ℹ 990 more rows

Schritt 4: visualize()

## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     0.28     0.56
sample_1_bootstrap %>%
        visualize(bins = 15) +
        shade_confidence_interval(endpoints = percentile_ci_1) +
        geom_vline(xintercept = 0.42, linetype = "dashed")

Interpretation Konfidenzintervall

Perzentil-Methode

Standardfehler-Methode

Stichprobenverteilung vs. Bootstrap-Verteilung

Stichprobenverteilung

## # A tibble: 1 × 1
##       se
##    <dbl>
## 1 0.0694

Bootstrapping-Verteilung

## # A tibble: 1 × 1
##       se
##    <dbl>
## 1 0.0658